{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Translation using tensor2tensor on Cloud ML Engine\n",
    "\n",
    "This notebook illustrates using the <a href=\"https://github.com/tensorflow/tensor2tensor\">tensor2tensor</a> library to do from-scratch, distributed training of a English-German translator. Then, the trained model is deployed to Cloud ML Engine and used to translate new pieces of text.\n",
    "<p/>\n",
    "### Install tensor2tensor, and specify Google Cloud Platform project and bucket"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "pip install tensor2tensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "PROJECT = 'cloud-training-demos' # REPLACE WITH YOUR PROJECT ID\n",
    "BUCKET = 'cloud-training-demos-ml' # REPLACE WITH YOUR BUCKET NAME\n",
    "REGION = 'us-central1' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1\n",
    "\n",
    "\n",
    "# this is what this notebook is demonstrating\n",
    "PROBLEM= 'my_translate_problem'\n",
    "\n",
    "# for bash\n",
    "os.environ['PROJECT'] = PROJECT\n",
    "os.environ['BUCKET'] = BUCKET\n",
    "os.environ['REGION'] = REGION\n",
    "os.environ['PROBLEM'] = PROBLEM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "! gcloud config set project $PROJECT"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "wget http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz\n",
    "wget http://data.statmt.org/wmt17/translation-task/dev.tgz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dev.tgz  training-parallel-nc-v12.tgz\r\n"
     ]
    }
   ],
   "source": [
    "!ls *.tgz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set up problem\n",
    "The Problem in tensor2tensor is where you specify parameters like the size of your vocabulary and where to get the training data from."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "rm -rf t2t\n",
    "mkdir -p t2t/ende"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/content/t2t\r\n"
     ]
    }
   ],
   "source": [
    "!pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Writing t2t/ende/problem.py\n"
     ]
    }
   ],
   "source": [
    "%writefile t2t/ende/problem.py\n",
    "import tensorflow as tf\n",
    "from tensor2tensor.data_generators import generator_utils\n",
    "from tensor2tensor.data_generators import problem\n",
    "from tensor2tensor.data_generators import text_encoder\n",
    "from tensor2tensor.data_generators import translate\n",
    "from tensor2tensor.utils import registry\n",
    "\n",
    "#TOPDIR=\"gs://{}/translate_ende/\".format(\"BUCKET_NAME\")\n",
    "TOPDIR=\"file:///content/t2t\"  # Make sure this matches the !pwd above\n",
    "\n",
    "_ENDE_TRAIN_DATASETS = [\n",
    "    [\n",
    "        \"{}/training-parallel-nc-v12.tgz\".format(TOPDIR),\n",
    "        (\"training/news-commentary-v12.de-en.en\",\n",
    "         \"training/news-commentary-v12.de-en.de\")\n",
    "    ],\n",
    "]\n",
    "_ENDE_TEST_DATASETS = [\n",
    "    [\n",
    "        \"{}/dev.tgz\".format(TOPDIR),\n",
    "        (\"dev/newstest2013.en\", \"dev/newstest2013.de\")\n",
    "    ],\n",
    "]\n",
    "\n",
    "@registry.register_problem\n",
    "class MyTranslateProblem(translate.TranslateProblem):\n",
    "  @property\n",
    "  def targeted_vocab_size(self):\n",
    "    return 2**13  # 8192\n",
    "\n",
    "  @property\n",
    "  def vocab_name(self):\n",
    "    return \"vocab.english_to_german\"\n",
    "  \n",
    "  def generator(self, data_dir, tmp_dir, train):\n",
    "    symbolizer_vocab = generator_utils.get_or_generate_vocab(\n",
    "        data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, sources=_ENDE_TRAIN_DATASETS)\n",
    "    datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS\n",
    "    tag = \"train\" if train else \"dev\"\n",
    "    data_path = translate.compile_data(tmp_dir, datasets, \"wmt_ende_tok_%s\" % tag)\n",
    "    return translate.token_generator(data_path + \".lang1\", data_path + \".lang2\",\n",
    "                           symbolizer_vocab, text_encoder.EOS_ID)\n",
    "\n",
    "  @property\n",
    "  def input_space_id(self):\n",
    "    return problem.SpaceID.EN_TOK\n",
    "\n",
    "  @property\n",
    "  def target_space_id(self):\n",
    "    return problem.SpaceID.DE_TOK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Writing t2t/ende/__init__.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile t2t/ende/__init__.py\n",
    "from . import problem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Writing t2t/setup.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile t2t/setup.py\n",
    "from setuptools import find_packages\n",
    "from setuptools import setup\n",
    "\n",
    "REQUIRED_PACKAGES = [\n",
    "  'tensor2tensor'\n",
    "]\n",
    "\n",
    "setup(\n",
    "    name='ende',\n",
    "    version='0.1',\n",
    "    author = 'Google',\n",
    "    author_email = 'training-feedback@cloud.google.com',\n",
    "    install_requires=REQUIRED_PACKAGES,\n",
    "    packages=find_packages(),\n",
    "    include_package_data=True,\n",
    "    description='My Translate Problem',\n",
    "    requires=[]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate training data \n",
    "\n",
    "Our problem (translation) requires the creation of text sequences from the training dataset.  This is done using t2t-datagen and the Problem defined in the previous section. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "DATA_DIR=./t2t_data\n",
    "TMP_DIR=$DATA_DIR/tmp\n",
    "rm -rf $DATA_DIR $TMP_DIR\n",
    "mkdir -p $DATA_DIR $TMP_DIR\n",
    "# Generate data\n",
    "t2t-datagen \\\n",
    "  --t2t_usr_dir=./t2t/ende \\\n",
    "  --problem=$PROBLEM \\\n",
    "  --data_dir=$DATA_DIR \\\n",
    "  --tmp_dir=$TMP_DIR"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Provide Cloud ML Engine access to data\n",
    "\n",
    "Copy the data to Google Cloud Storage, and then provide access to the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "DATA_DIR=./t2t_data\n",
    "gcloud storage rm --recursive gs://${BUCKET}/translate_ende/\n",
    "gcloud storage cp ${DATA_DIR}/${PROBLEM}* ${DATA_DIR}/vocab* gs://${BUCKET}/translate_ende/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "PROJECT_ID=$PROJECT\n",
    "AUTH_TOKEN=$(gcloud auth print-access-token)\n",
    "SVC_ACCOUNT=$(curl -X GET -H \"Content-Type: application/json\" \\\n",
    "    -H \"Authorization: Bearer $AUTH_TOKEN\" \\\n",
    "    https://ml.googleapis.com/v1/projects/${PROJECT_ID}:getConfig \\\n",
    "    | python -c \"import json; import sys; response = json.load(sys.stdin); \\\n",
    "    print response['serviceAccount']\")\n",
    "\n",
    "echo \"Authorizing the Cloud ML Service account $SVC_ACCOUNT to access files in $BUCKET\"\n",
    "gcloud storage buckets update --add-default-object-acl-grant entity=user-$SVC_ACCOUNT,role=READER gs://$BUCKET\n",
    "gcloud storage objects update --add-acl-grant entity=user-$SVC_ACCOUNT,role=READER --recursive gs://$BUCKET  # error message (if bucket is empty) can be ignored\n",
    "gcloud storage buckets update --add-acl-grant entity=user-$SVC_ACCOUNT,role=WRITER gs://$BUCKET",
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train model as a Python package\n",
    "\n",
    "To submit the training job to Cloud Machine Learning Engine, we need a Python module with a main(). We'll use the t2t-trainer that is distributed with tensor2tensor as the main"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "wget https://raw.githubusercontent.com/tensorflow/tensor2tensor/master/tensor2tensor/bin/t2t-trainer\n",
    "mv t2t-trainer t2t/ende/t2t-trainer.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!touch t2t/__init__.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "t2t\r\n",
      "t2t/__init__.py\r\n",
      "t2t/ende\r\n",
      "t2t/ende/__init__.py\r\n",
      "t2t/ende/__init__.pyc\r\n",
      "t2t/ende/problem.py\r\n",
      "t2t/ende/problem.pyc\r\n",
      "t2t/ende/t2t-trainer.py\r\n",
      "t2t/setup.py\r\n"
     ]
    }
   ],
   "source": [
    "!find t2t"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's test that the Python package works. Since we are running this locally, I'll try it out on a subset of the original data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "BASE=gs://${BUCKET}/translate_ende/data\n",
    "OUTDIR=gs://${BUCKET}/translate_ende/subset\n",
    "gcloud storage rm --recursive $OUTDIR\n",
    "gcloud storage cp \\\n",
    "    ${BASE}/${PROBLEM}-train-0008* \\\n",
    "    ${BASE}/${PROBLEM}-dev-00000*  \\\n",
    "    ${BASE}/vocab* \\\n",
    "    $OUTDIR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "OUTDIR=./trained_model\n",
    "rm -rf $OUTDIR\n",
    "export PYTHONPATH=${PYTHONPATH}:${PWD}/t2t\n",
    "python -m ende.t2t-trainer \\\n",
    "  --data_dir=gs://${BUCKET}/translate_ende/subset \\\n",
    "  --problems=$PROBLEM \\\n",
    "  --model=transformer \\\n",
    "  --hparams_set=transformer_base_single_gpu \\\n",
    "  --output_dir=$OUTDIR --job-dir=$OUTDIR"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train on Cloud ML Engine\n",
    "\n",
    "Once we have a working Python package, training on a Cloud ML Engine GPU is straightforward"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%bash\n",
    "OUTDIR=gs://${BUCKET}/translate_ende/model\n",
    "JOBNAME=t2t_$(date -u +%y%m%d_%H%M%S)\n",
    "echo $OUTDIR $REGION $JOBNAME\n",
    "gcloud storage rm --recursive --continue-on-error $OUTDIR\n",
    "gcloud ml-engine jobs submit training $JOBNAME \\\n",
    "   --region=$REGION \\\n",
    "   --staging-bucket=gs://$BUCKET \\\n",
    "   --scale-tier=BASIC_GPU \\\n",
    "   --module-name=ende.t2t-trainer \\\n",
    "   --package-path=${PWD}/t2t/ende \\\n",
    "   --job-dir=$OUTDIR \\\n",
    "   --runtime-version=1.4 \\\n",
    "   -- \\\n",
    "  --data_dir=gs://${BUCKET}/translate_ende/data \\\n",
    "  --problems=my_translate_problem \\\n",
    "  --model=transformer \\\n",
    "  --hparams_set=transformer_base_single_gpu \\\n",
    "  --output_dir=$OUTDIR"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright 2018 Google Inc. Licensed under the Apache License, Version 2.0 (the \\\"License\\\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \\\"AS IS\\\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
